We look for enrichments in all the 90%-regions
import re, os, sys, pickle, pickle
from pathlib import Path
import numpy
import scipy
import pandas
from pandas import DataFrame, Series
from sklearn.decomposition import PCA
from collections import Counter, defaultdict
import random, bisect
random.seed(7)
import pyfaidx
# my own libaries
from GenomicWindows import window
import GenomicIntervals
numpy.random.seed(7)
Plotting setup:
%matplotlib inline
# Make inline plots vector graphics instead of raster graphics
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
from mpl_toolkits.basemap import Basemap
#matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)
import mpld3
import seaborn as sns
sns.set() # sets seaborn default "prettyness:
sns.set_style("whitegrid")
sns.set_context("paper")
# lowess for plotting
from statsmodels.nonparametric.smoothers_lowess import lowess
set1 = {'red': '#e41a1c', 'blue': '#377eb8', 'green': '#4daf4a',
'purple': '#984ea3', 'orange': '#ff7f00',
'yellow': '#ffff33', 'brown': '#a65628'}
Ignore deprecation warnings from mainly seaborn:
# silence deprecation warnings (lots from seaborn)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=numpy.VisibleDeprecationWarning)
Analysis dirs:
root_dir = Path(os.environ['HOME'], 'simons/faststorage/people/kmt')
meta_data_dir = Path(os.environ['HOME'], 'simons/faststorage/data/metadata')
steps_dir = root_dir / 'steps'
argweaver_dir = steps_dir / 'argweaver/output'
results_dir = root_dir / 'results'
figures_dir = root_dir / 'figures'
data_dir = root_dir / 'data'
pi_dir = steps_dir / 'pi_stores'
dist_dir = steps_dir / 'dist_stores'
#pi_dir = root_dir / 'old_pi_stores'
male_x_haploid_dir = steps_dir / 'male_x_haploids'
Local code in the scripts dir on the cluster:
scripts_dir = root_dir / 'scripts'
if str(scripts_dir) not in sys.path:
sys.path.append(str(scripts_dir))
import simons_meta_data
import hg19_chrom_sizes
from toggle_code_and_errors import toggle_code_html, toggle_errors_html
Import variables global to the entire analysis:
import analysis_globals
def silent_nanmean(x):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
return numpy.nanmean(x)
def ident_scalar(s):
x = s.unique()
assert(len(x)) == 1, x
return x[0]
# easy loading of meta data in a consistent manner across code
individuals, populations, regions = simons_meta_data.get_meta_data(meta_data_dir=meta_data_dir)
pop_categories = pandas.read_hdf(str(results_dir / 'population_categories.store'), 'sr')
region_categories = pandas.read_hdf(str(results_dir / 'region_categories.store'), 'sr')
region_colors = dict(zip(list(region_categories),
['#e41a1c', '#377eb8', '#984ea3', '#4daf4a',
'#ff7f00', '#ffff33', '#a65628']))
chromosome_lengths = dict((k.replace('chr', ''), v) for k, v in hg19_chrom_sizes.hg19_chrom_sizes.items())
reference_genome_file = Path('/home', 'kmt', 'simons',
'faststorage', 'cteam_lite_public3', 'FullyPublic', 'Href.fa')
def genes_overlapping_intervals(genes, peaks):
lst = list()
for tup in peaks.itertuples():
df = genes.copy().loc[(genes.start < tup.end) & (genes.end > tup.start)]
df['peak_start'] = tup.start
df['peak_end'] = tup.end
lst.append(df)
return pandas.concat(lst)
def genes_in_intervals(genes, peaks):
pos = genes.start + (genes.end-genes.start)/2
idx = numpy.equal(numpy.searchsorted(peaks.start, pos) - 1, numpy.searchsorted(peaks.end, pos, side='left'))
return genes.copy().iloc[idx]
sweep_peaks = pandas.read_hdf(results_dir / 'sweep_peaks.hdf')
extended_peak_regions_subset = (pandas.read_hdf(results_dir / 'extended_peak_regions_90%.hdf')
)
extended_peak_regions_subset.head()
extended_peak_regions_subset['start'] = extended_peak_regions_subset.start_pos
extended_peak_regions_subset['end'] = extended_peak_regions_subset.end_pos
biomart_genes_x = pandas.read_hdf(results_dir / 'biomart_genes.hdf').loc[lambda df: df.chrom == 'X']
print(len(biomart_genes_x))
biomart_genes_x.head()
2237 genes show elevated expression in the testis compared to other tissues:
hpatlas_testis_elevated = pandas.read_table(data_dir / 'tissue_specificity_rna_testis_elevated.tsv')
testis_elevated_genes = biomart_genes_x.loc[biomart_genes_x.name.isin(hpatlas_testis_elevated.Gene)]
len(testis_elevated_genes)
1079 enriched have at least five-fold higher mRNA levels compared to all other tissues
hpatlas_testis_enriched = pandas.read_table(data_dir / 'tissue_specificity_rna_testis_Tissue.tsv')
testis_enriched_genes = biomart_genes_x.loc[biomart_genes_x.name.isin(hpatlas_testis_enriched.Gene)]
len(testis_enriched_genes)
overlap = genes_overlapping_intervals(testis_elevated_genes, extended_peak_regions_subset)
overlap
Fisher's exact test show no enrichment:
overlap_all = genes_overlapping_intervals(biomart_genes_x, extended_peak_regions_subset)
nr_prot_overlapping = len(overlap_all.loc[lambda df: df['Gene type'] == 'protein_coding'])
nr_prot_biomart = len(biomart_genes_x.loc[lambda df: df['Gene type'] == 'protein_coding'])
table = [[len(overlap), len(testis_elevated_genes)-len(overlap)],
[nr_prot_overlapping, nr_prot_biomart-nr_prot_overlapping]]
print(table)
#scipy.stats.fisher_exact([[20, 100],[100, 1000]], alternative='greater')
scipy.stats.fisher_exact(table, alternative='greater')
overlap_all['Gene type'].value_counts()
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
display(overlap_all
#.loc[lambda df: df['Gene type'] == 'protein_coding']
.loc[lambda df: (df['Gene type'] != 'protein_coding') & (df['Gene type'] != 'pseudogene')]
.sort_values(['start']))
chalmel_genes = pandas.read_hdf(results_dir / 'chalmel_genes.hdf')
df = chalmel_genes.loc[lambda df: (df.Pattern.isin(['9', '10', '11', '12', '13'])) & (df.chrom == 'X'),
['name', 'chrom', 'start', 'end', 'Pattern', 'Expression in Testis']]
genes_overlapping_intervals(df, extended_peak_regions_subset)
trine_line_x_genes = pandas.read_hdf(results_dir / 'trine_line_x_genes.hdf')
genes_overlapping_intervals(trine_line_x_genes, extended_peak_regions_subset)
with open(str(results_dir / 'candidate_gene_list.txt'), 'w') as f:
for name in overlap_all.loc[lambda df: df['Gene type'] == 'protein_coding'].name:
print(name, file=f)
with open(str(results_dir / 'bacground_gene_list.txt'), 'w') as f:
for name in biomart_genes_x.loc[lambda df: df['Gene type'] == 'protein_coding'].name:
print(name, file=f)
ampliconic_regions = pandas.read_hdf(results_dir / 'ampliconic_regions.hdf')
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'),
ampliconic_regions.loc[lambda df: df.chrom == 'X'],
chromosome_sizes=chromosome_lengths,
samples=1000)
Non-random distance to ampliconic reginos not overlapping: Are ampliconic regions not overlapping low pi regions closer the low pi regions than expected?
ampl_reg_not_overlapping = GenomicIntervals.interval_diff(ampliconic_regions.loc[lambda df: df.chrom == 'X'],
extended_peak_regions_subset.assign(chrom = 'X'))
GenomicIntervals.distance_stat(ampl_reg_not_overlapping, extended_peak_regions_subset.assign(chrom = 'X'))
human_chimp_low_ils_regions_chrX = pandas.read_hdf(results_dir / 'human_chimp_low_ils_regions_chrX.hdf')
human_orang_low_ils_regions_chrX = pandas.read_hdf(results_dir / 'human_orang_low_ils_regions_chrX.hdf')
Human-chimp ILS:
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'),
human_chimp_low_ils_regions_chrX,
chromosome_sizes=chromosome_lengths,
samples=100000)
Human-orang ILS:
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'),
human_orang_low_ils_regions_chrX,
chromosome_sizes=chromosome_lengths,
samples=1000)
sweep_data = pandas.read_hdf(results_dir / 'sweep_data.hdf')
missing_regions = pandas.read_hdf(results_dir / 'missing_regions.hdf')
plot_df = (sweep_data
.groupby(['start', 'end', 'region_1', 'region_label_1'])['swept']
.aggregate(['sum', 'size'])
.rename(columns={'sum': 'nr_swept', 'size': 'total'})
.reset_index(level=['start', 'end', 'region_1', 'region_label_1'])
)
plot_df.sort_values(by=['start', 'region_1'], inplace=True)
plot_df['cum_nr_swept'] = (plot_df
.loc[plot_df.region_label_1 != 'Africa']
.groupby(['start', 'end'])['nr_swept']
.transform('cumsum')
)
plot_df['cum_total'] = (plot_df
.loc[plot_df.region_label_1 != 'Africa']
.groupby(['start', 'end'])['total']
.transform('sum')
)
plot_df.head(7)
class ClickInfo(mpld3.plugins.PluginBase):
"""mpld3 Plugin for getting info on click """
JAVASCRIPT = """
mpld3.register_plugin("clickinfo", ClickInfo);
ClickInfo.prototype = Object.create(mpld3.Plugin.prototype);
ClickInfo.prototype.constructor = ClickInfo;
ClickInfo.prototype.requiredProps = ["id", "urls"];
function ClickInfo(fig, props){
mpld3.Plugin.call(this, fig, props);
};
ClickInfo.prototype.draw = function(){
var obj = mpld3.get_element(this.props.id);
urls = this.props.urls;
obj.elements().on("mousedown",
function(d, i){
window.open(urls[i], '_blank')});
}
"""
def __init__(self, points, urls):
self.points = points
self.urls = urls
if isinstance(points, matplotlib.lines.Line2D):
suffix = "pts"
else:
suffix = None
self.dict_ = {"type": "clickinfo",
"id": mpld3.utils.get_id(points, suffix),
"urls": urls}
#ucsc_search = "https://genome-euro.ucsc.edu/cgi-bin/hgTracks?hgsid=226837763_FKVw0jsAcbutCxMf8luSHlzwx2xW&org=Human&db=hg37&position={}&pix=1361"
ucsc_search = "https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg19&pix=2000&position={}&hgsid=230604418_7ASTqKyqNTW8yadJActGtpWwO1V2"
# fig, ax = plt.subplots()
# points = ax.scatter(numpy.random.rand(50), numpy.random.rand(50),
# s=500, alpha=0.3)
# urls = [ucsc_search.format('CCR5') for i in range(50)]
# # mpld3.plugins.connect(fig, ClickInfo(points, urls))
# # mpld3.display(fig)
with sns.axes_style("whitegrid", {'axes.grid' : False}):
# fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(13, 9),
# subplot_kw={'xlim':(0, chromosome_lengths['X']), 'ylim':(0, 1)})
fig, ax1 = plt.subplots(1, 1, sharex=True, sharey=True, figsize=(13, 9),
subplot_kw={'xlim':(0, chromosome_lengths['X']), 'ylim':(0, 1.1)})
zorder = 1
zorder += 1
genes = biomart_genes_x
pos_list = list()
labels = list()
for tup in genes.itertuples():
x = tup.start + (tup.end - tup.start) / 2
pos_list.append(x)
labels.append("{}".format(tup.name))
ax1.add_line(Line2D([x, x], [0, 1.1], color='lightgrey', zorder=zorder))
zorder += 1
scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='lightgrey', s=50, zorder=zorder)
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)
urls = [ucsc_search.format(tup.name) for tup in genes.itertuples()]
mpld3.plugins.connect(fig, ClickInfo(scatter, urls))
# zorder += 1
# genes = chalmel_genes_subset
# pos_list = list()
# labels = list()
# for tup in genes.itertuples():
# x = tup.start + (tup.end - tup.start) / 2
# pos_list.append(x)
# labels.append("{} {}".format(tup.name, tup.Pattern))
# ax1.add_line(Line2D([x, x], [0, 1.1], color='red', zorder=zorder))
# zorder += 1
# scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='red', s=50, zorder=zorder)
# tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
# mpld3.plugins.connect(fig, tooltip)
# urls = [ucsc_search.format(tup.name) for tup in genes.itertuples()]
# mpld3.plugins.connect(fig, ClickInfo(scatter, urls))
# zorder += 1
# genes = trine_line_x_genes
# pos_list = list()
# labels = list()
# for tup in genes.itertuples():
# x = tup.start + (tup.end - tup.start) / 2
# pos_list.append(x)
# labels.append("{}".format(tup.ovlpRepExon))
# ax1.add_line(Line2D([x, x], [0, 1.1], color='pink', zorder=zorder))
# zorder += 1
# scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='pink', s=50, zorder=zorder)
# tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
# mpld3.plugins.connect(fig, tooltip)
# urls = [ucsc_search.format(tup.ovlpRepExon) for tup in genes.itertuples()]
# mpld3.plugins.connect(fig, ClickInfo(scatter, urls))
regs = [x for x in plot_df.region_1.cat.categories if x != 'Africa'][::-1]
for reg in regs:
df = plot_df.loc[plot_df.region_label_1 == reg]
zorder += 1
for tup in df.itertuples():
if tup.nr_swept:
g = ax1.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, tup.cum_nr_swept/tup.cum_total,
facecolor=region_colors[reg],
linewidth=0,
edgecolor=None,#region_colors[reg],
zorder=zorder))
# df = plot_df.loc[plot_df.region_label_1 == 'Africa']
# for tup in df.itertuples():
# if tup.nr_swept:
# g = ax2.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, tup.nr_swept/tup.total,
# facecolor=region_colors['Africa'],
# edgecolor=None,#region_colors[reg],
# ))
zorder += 1
for tup in missing_regions.loc[missing_regions.is_missing == True].itertuples():
g = ax1.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, 1,
facecolor='lightgray',
#edgecolor=None,
linewidth=0,
alpha=0.5,
zorder=zorder))
# g = ax2.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, 1,
# facecolor='lightgray',
# edgecolor=None,
# alpha=0.5,
# zorder=zorder))
plt.savefig(str(figures_dir / "tmp2.pdf"))
#plt.close() # closing teh plot suppres automatic plotting without plt.show()
mpld3.display(fig)